package_Vector <- c(
"dplyr",
"data.table",
"ggplot2",
"corrplot",
"reshape2",
"plyr",
"plotly",
"caret"
)
for (package in package_Vector) {
if (!require(package, character.only=T, quietly=T)) {
install.packages(package)
library(package, character.only=T)
}
}
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## corrplot 0.84 loaded
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#pobierz plik
if (file.exists(paste(getwd(),"/","sledzie.csv",sep = ""))==FALSE) {
download.file("http://www.cs.put.poznan.pl/dbrzezinski/teaching/sphd/sledzie.csv", paste(getwd(),"/","sledzie.csv",sep = ""))
}
#czytaj dane
daneSurowe<- read.csv("sledzie.csv",na.strings = c("?"))
#Zastap NA wartosciami srednimi dla grup
srednieGrupowe <- ddply(daneSurowe,c("totaln","xmonth"),numcolwise(mean,na.rm=TRUE))
f1 <- function(x) {
for (icol in 1:ncol(x)) {
ifelse(!is.na(x[,icol]),(x[,icol]),(x[,icol] <- subset(srednieGrupowe, (srednieGrupowe$totaln == x$totaln) & (srednieGrupowe$xmonth == x$xmonth),colnames(x)[icol])))
}
return (x)
}
daneBezNA <- ddply(daneSurowe,1,f1)
daneBezOutliers <- as.data.frame(apply(daneBezNA[,c("length", "cfin1", "cfin2", "chel1", "chel2", "lcop1", "lcop2")],2,function(x) {ifelse((x>(mean(x)+3*sd(x))) | (x<(mean(x)-3*sd(x))),NA,x)}))
daneBezOutliers <- daneBezNA[complete.cases(daneBezOutliers),]
dane <- daneBezOutliers
PoniĹĽej znajdujÄ… siÄ™ podstawowe statystyki opisowe dla zbioru danych
summary(dane[,c("length","cfin1","cfin2","chel1","chel2","lcop1","lcop2","fbar","recr","cumf","totaln","sst","sal","xmonth","nao")])
## length cfin1 cfin2 chel1
## Min. :20.5 Min. :0.0000 Min. : 0.0000 Min. : 0.000
## 1st Qu.:24.0 1st Qu.:0.0000 1st Qu.: 0.2500 1st Qu.: 2.267
## Median :25.0 Median :0.1111 Median : 0.7012 Median : 5.095
## Mean :25.2 Mean :0.3929 Mean : 1.5154 Mean : 7.263
## 3rd Qu.:26.5 3rd Qu.:0.3333 3rd Qu.: 1.5805 3rd Qu.:11.500
## Max. :30.0 Max. :3.1446 Max. :13.1434 Max. :36.333
## chel2 lcop1 lcop2 fbar
## Min. : 5.238 Min. : 0.3074 Min. : 7.849 Min. :0.0680
## 1st Qu.:13.427 1st Qu.: 2.5030 1st Qu.:17.808 1st Qu.:0.2010
## Median :20.158 Median : 5.9167 Median :24.859 Median :0.3180
## Mean :20.228 Mean :10.1994 Mean :26.609 Mean :0.3169
## 3rd Qu.:26.812 3rd Qu.:16.3656 3rd Qu.:36.190 3rd Qu.:0.4030
## Max. :39.568 Max. :41.1667 Max. :64.861 Max. :0.8490
## recr cumf totaln sst
## Min. : 140515 Min. :0.06833 Min. : 144137 Min. :12.77
## 1st Qu.: 364794 1st Qu.:0.14809 1st Qu.: 306160 1st Qu.:13.63
## Median : 459347 Median :0.22438 Median : 539558 Median :13.87
## Mean : 538305 Mean :0.22252 Mean : 521658 Mean :13.91
## 3rd Qu.: 724151 3rd Qu.:0.28116 3rd Qu.: 730351 3rd Qu.:14.21
## Max. :1380210 Max. :0.39801 Max. :1015595 Max. :14.73
## sal xmonth nao
## Min. :35.40 Min. : 1.00 Min. :-3.7800
## 1st Qu.:35.51 1st Qu.: 5.00 1st Qu.:-1.5400
## Median :35.51 Median : 8.00 Median : 0.3400
## Mean :35.51 Mean : 7.22 Mean : 0.1526
## 3rd Qu.:35.52 3rd Qu.: 9.00 3rd Qu.: 1.8000
## Max. :35.61 Max. :12.00 Max. : 5.0800
meltData <- melt(dane[,c("length", "cfin1", "cfin2", "chel1", "chel2", "lcop1", "lcop2", "cumf", "sst", "sal", "nao")],na.rm = TRUE)
## No id variables; using all as measure variables
p <- ggplot(meltData, aes(factor(variable),value))
p + geom_boxplot() + facet_wrap(~variable, scale = "free")
corelation <- cor((dane[,c("length","cfin1","cfin2","chel1","chel2","lcop1","lcop2","fbar","recr","cumf", "xmonth", "sst", "nao", "sal")]))
corrplot(corelation, col = colorRampPalette(c("blue","white","red"))(5), title = "Korelacje pomiędzy zmiennymi", method = "ellipse", outline = T, diag=FALSE, addgrid.col = "darkgray", order="hclust", mar = c(4,0,4,0), addrect = 4, rect.col = "black", rect.lwd = 5,cl.pos = "b", tl.col = "indianred4")
#plot_ly(dane,x=dane$fbar,y=dane$length, type ="scatter", mode ="markers")
dane$fbarFactor <- as.factor(ifelse(dane$fbar<quantile(dane$fbar,0.125),"Q.125",ifelse(dane$fbar>=quantile(dane$fbar,0.125) & dane$fbar<quantile(dane$fbar,0.25),"Q.250",ifelse(dane$fbar>=quantile(dane$fbar,0.25) & dane$fbar<quantile(dane$fbar,0.375),"Q.375",ifelse(dane$fbar>=quantile(dane$fbar,0.375) & dane$fbar<quantile(dane$fbar,0.50),"Q.500",ifelse(dane$fbar>=quantile(dane$fbar,0.50),"Q.99","0"))))))
p <- plot_ly(dane,
x = ~fbarFactor,
y = ~length,
split= ~fbarFactor,
type='box'
) %>% layout(title = "Długość śledzia w czasie jego życia",
xaxis = list(title = "Kwantyl"),
yaxis = list (title = "Długość śledzia"))
p
#plot_ly(dane,x=dane$fbar,y=dane$length, type ="scatter", mode ="markers")
dane$newlength <- (round(dane$length/0.5)*0.5)
set.seed(123)
inTraining <- createDataPartition(y=dane$length,p=0.75,list=FALSE)
ctrl <- caret::trainControl(method="repeatedcv",number = 2, repeats = 5)
fit <- train(dane[inTraining,-which(names(dane) %in% c("X","length","newlength","totaln","fbarFactor"))],
dane[inTraining,which(names(dane) == "length")],
method= "enet",
trControl = ctrl,
ntree =10,
importance=T)
fit
## Elasticnet
##
## 36377 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (2 fold, repeated 5 times)
## Summary of sample sizes: 18188, 18189, 18189, 18188, 18188, 18189, ...
## Resampling results across tuning parameters:
##
## lambda fraction RMSE Rsquared MAE
## 0e+00 0.050 1.568187 0.1893865 1.268536
## 0e+00 0.525 1.390346 0.2858234 1.114684
## 0e+00 1.000 1.365824 0.3055324 1.091980
## 1e-04 0.050 1.568234 0.1893865 1.268574
## 1e-04 0.525 1.390396 0.2857795 1.114740
## 1e-04 1.000 1.365823 0.3055324 1.091991
## 1e-01 0.050 1.589116 0.1893865 1.285737
## 1e-01 0.525 1.420142 0.2569521 1.143091
## 1e-01 1.000 1.381125 0.2908730 1.107573
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 1 and lambda = 1e-04.
rfClasses <- predict(fit, newdata = dane[-inTraining,-which(names(dane) %in% c("X","length","newlength","totaln","fbarFactor"))],type="raw")
rfClasses2 <- (round(rfClasses/0.5)*0.5)
confusionMatrix(data = factor(rfClasses2,levels = unique(dane[-inTraining,which(names(dane) == "newlength")])),
reference = factor(dane[-inTraining,which(names(dane) == "newlength")]))
## Warning in confusionMatrix.default(data = factor(rfClasses2, levels =
## unique(dane[-inTraining, : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 20.5 21 21.5 22 22.5 23 23.5 24 24.5 25 25.5 26 26.5 27
## 20.5 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 21 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 21.5 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 22 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 22.5 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 23 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 23.5 7 11 32 104 145 184 172 170 117 103 49 26 9 3
## 24 10 40 40 82 119 127 129 119 74 43 30 17 1 1
## 24.5 0 4 12 35 47 103 119 162 177 172 150 111 101 46
## 25 0 1 1 9 24 67 116 155 215 273 254 245 163 128
## 25.5 1 4 19 33 52 133 214 370 510 504 548 468 377 219
## 26 0 0 0 4 9 29 54 117 200 267 305 377 366 311
## 26.5 0 0 0 0 3 3 4 21 31 19 19 13 25 27
## 27 0 0 0 0 0 2 3 8 5 19 20 23 49 74
## 27.5 0 0 0 0 0 0 1 1 2 2 4 7 8 4
## 28 0 0 0 0 0 1 4 2 6 4 4 10 9 17
## 28.5 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 29 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 29.5 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Reference
## Prediction 27.5 28 28.5 29 29.5 30
## 20.5 0 0 0 0 0 0
## 21 0 0 0 0 0 0
## 21.5 0 0 0 0 0 0
## 22 0 0 0 0 0 0
## 22.5 0 0 0 0 0 0
## 23 0 0 0 0 0 0
## 23.5 0 1 0 0 0 0
## 24 2 0 0 0 0 0
## 24.5 17 6 4 1 0 0
## 25 53 29 12 11 1 2
## 25.5 162 99 66 22 8 3
## 26 264 164 78 21 7 3
## 26.5 20 14 9 2 0 0
## 27 70 54 44 35 6 0
## 27.5 4 7 5 3 0 0
## 28 10 3 2 0 0 0
## 28.5 0 0 0 0 0 0
## 29 0 0 0 0 0 0
## 29.5 0 0 0 0 0 0
## 30 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.1462
## 95% CI : (0.1399, 0.1526)
## No Information Rate : 0.116
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.0471
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 20.5 Class: 21 Class: 21.5 Class: 22
## Sensitivity 0.000000 0.000000 0.000000 0.00000
## Specificity 1.000000 1.000000 1.000000 1.00000
## Pos Pred Value NaN NaN NaN NaN
## Neg Pred Value 0.998515 0.995051 0.991421 0.97798
## Prevalence 0.001485 0.004949 0.008579 0.02202
## Detection Rate 0.000000 0.000000 0.000000 0.00000
## Detection Prevalence 0.000000 0.000000 0.000000 0.00000
## Balanced Accuracy 0.500000 0.500000 0.500000 0.50000
## Class: 22.5 Class: 23 Class: 23.5 Class: 24
## Sensitivity 0.00000 0.00000 0.21078 0.105778
## Specificity 1.00000 1.00000 0.91501 0.934988
## Pos Pred Value NaN NaN 0.15181 0.142686
## Neg Pred Value 0.96709 0.94647 0.94140 0.910887
## Prevalence 0.03291 0.05353 0.06731 0.092799
## Detection Rate 0.00000 0.00000 0.01419 0.009816
## Detection Prevalence 0.00000 0.00000 0.09346 0.068795
## Balanced Accuracy 0.50000 0.50000 0.56290 0.520383
## Class: 24.5 Class: 25 Class: 25.5 Class: 26
## Sensitivity 0.1324 0.19417 0.3962 0.2907
## Specificity 0.8989 0.86134 0.6961 0.7969
## Pos Pred Value 0.1397 0.15520 0.1438 0.1464
## Neg Pred Value 0.8931 0.89068 0.8995 0.9036
## Prevalence 0.1103 0.11598 0.1141 0.1070
## Detection Rate 0.0146 0.02252 0.0452 0.0311
## Detection Prevalence 0.1045 0.14510 0.3144 0.2125
## Balanced Accuracy 0.5157 0.52775 0.5462 0.5438
## Class: 26.5 Class: 27 Class: 27.5 Class: 28
## Sensitivity 0.022563 0.089157 0.006645 0.0079576
## Specificity 0.983205 0.970070 0.996181 0.9941257
## Pos Pred Value 0.119048 0.179612 0.083333 0.0416667
## Neg Pred Value 0.909091 0.935445 0.950476 0.9689652
## Prevalence 0.091397 0.068465 0.049658 0.0310979
## Detection Rate 0.002062 0.006104 0.000330 0.0002475
## Detection Prevalence 0.017322 0.033985 0.003959 0.0059391
## Balanced Accuracy 0.502884 0.529613 0.501413 0.5010416
## Class: 28.5 Class: 29 Class: 29.5 Class: 30
## Sensitivity 0.00000 0.000000 0.000000 0.0000000
## Specificity 1.00000 1.000000 1.000000 1.0000000
## Pos Pred Value NaN NaN NaN NaN
## Neg Pred Value 0.98185 0.992164 0.998185 0.9993401
## Prevalence 0.01815 0.007836 0.001815 0.0006599
## Detection Rate 0.00000 0.000000 0.000000 0.0000000
## Detection Prevalence 0.00000 0.000000 0.000000 0.0000000
## Balanced Accuracy 0.50000 0.500000 0.500000 0.5000000
varImp(fit,scale="TRUE")
## loess r-squared variable importance
##
## Overall
## sst 100.000
## fbar 22.516
## nao 18.869
## lcop1 11.800
## chel1 6.967
## chel2 5.473
## sal 5.056
## cfin1 2.791
## cumf 1.412
## lcop2 1.401
## recr 1.100
## cfin2 0.389
## xmonth 0.000